We want to look at real datasets to simulate realistic datasets.
data <- read.table("expression_mRNA_17-Aug-2014.txt", sep='\t', stringsAsFactors = FALSE, comment.char = '%')
level1 <- as.factor(as.matrix(data)[9,-(1:2)])
tissue <- as.factor(as.matrix(data)[1,-(1:2)])
table(tissue)
## tissue
## ca1hippocampus sscortex
## 1314 1691
group <- as.factor(as.matrix(data)[2,-(1:2)])
table(tissue, group)
## group
## tissue 1 2 3 4 5 6 7 8 9
## ca1hippocampus 126 20 919 121 14 18 64 17 15
## sscortex 164 370 29 699 84 157 134 9 45
nmolecule <- as.numeric(as.matrix(data)[3,-(1:2)])
sex <- as.factor(as.matrix(data)[5,-(1:2)])
table(tissue, sex)
## sex
## tissue -1 0 1
## ca1hippocampus 748 46 520
## sscortex 776 0 915
level2 <- as.factor(as.matrix(data)[10,-(1:2)])
table(level1)
## level1
## astrocytes_ependymal endothelial-mural interneurons
## 224 235 290
## microglia oligodendrocytes pyramidal CA1
## 98 820 939
## pyramidal SS
## 399
table(level1, level2)
## level2
## level1 (none) Astro1 Astro2 CA1Pyr1 CA1Pyr2 CA1PyrInt
## astrocytes_ependymal 65 68 61 0 0 0
## endothelial-mural 15 0 0 0 0 0
## interneurons 0 0 0 0 0 0
## microglia 0 0 0 0 0 0
## oligodendrocytes 0 0 0 0 0 0
## pyramidal CA1 0 0 0 380 447 49
## pyramidal SS 109 0 0 0 0 0
## level2
## level1 CA2Pyr2 Choroid ClauPyr Epend Int1 Int10 Int11
## astrocytes_ependymal 0 10 0 20 0 0 0
## endothelial-mural 0 0 0 0 0 0 0
## interneurons 0 0 0 0 12 21 10
## microglia 0 0 0 0 0 0 0
## oligodendrocytes 0 0 0 0 0 0 0
## pyramidal CA1 41 0 0 0 0 0 0
## pyramidal SS 0 0 5 0 0 0 0
## level2
## level1 Int12 Int13 Int14 Int15 Int16 Int2 Int3 Int4 Int5
## astrocytes_ependymal 0 0 0 0 0 0 0 0 0
## endothelial-mural 0 0 0 0 0 0 0 0 0
## interneurons 21 15 22 18 20 24 10 15 20
## microglia 0 0 0 0 0 0 0 0 0
## oligodendrocytes 0 0 0 0 0 0 0 0 0
## pyramidal CA1 0 0 0 0 0 0 0 0 0
## pyramidal SS 0 0 0 0 0 0 0 0 0
## level2
## level1 Int6 Int7 Int8 Int9 Mgl1 Mgl2 Oligo1 Oligo2 Oligo3
## astrocytes_ependymal 0 0 0 0 0 0 0 0 0
## endothelial-mural 0 0 0 0 0 0 0 0 0
## interneurons 22 23 26 11 0 0 0 0 0
## microglia 0 0 0 0 17 16 0 0 0
## oligodendrocytes 0 0 0 0 0 0 45 98 87
## pyramidal CA1 0 0 0 0 0 0 0 0 0
## pyramidal SS 0 0 0 0 0 0 0 0 0
## level2
## level1 Oligo4 Oligo5 Oligo6 Peric Pvm1 Pvm2 S1PyrDL
## astrocytes_ependymal 0 0 0 0 0 0 0
## endothelial-mural 0 0 0 21 0 0 0
## interneurons 0 0 0 0 0 0 0
## microglia 0 0 0 0 32 33 0
## oligodendrocytes 106 125 359 0 0 0 0
## pyramidal CA1 0 0 0 0 0 0 0
## pyramidal SS 0 0 0 0 0 0 81
## level2
## level1 S1PyrL23 S1PyrL4 S1PyrL5 S1PyrL5a S1PyrL6 S1PyrL6b
## astrocytes_ependymal 0 0 0 0 0 0
## endothelial-mural 0 0 0 0 0 0
## interneurons 0 0 0 0 0 0
## microglia 0 0 0 0 0 0
## oligodendrocytes 0 0 0 0 0 0
## pyramidal CA1 0 0 0 0 0 0
## pyramidal SS 74 26 16 28 39 21
## level2
## level1 SubPyr Vend1 Vend2 Vsmc
## astrocytes_ependymal 0 0 0 0
## endothelial-mural 0 32 105 62
## interneurons 0 0 0 0
## microglia 0 0 0 0
## oligodendrocytes 0 0 0 0
## pyramidal CA1 22 0 0 0
## pyramidal SS 0 0 0 0
counts <- as.matrix(data[12:NCOL(data),-(1:2)]) ## why NCOL here, not not NROW?
counts <- matrix(as.numeric(counts), ncol=ncol(counts), nrow=nrow(counts))
rownames(counts) <- data[12:NCOL(data),1]
colnames(counts) <- data[8, -(1:2)]
select = level1 %in% c('pyramidal CA1', 'interneurons',
'astrocytes_ependymal', 'oligodendrocytes')
counts = counts[, select]
group = droplevels(group[select])
sex = sex[select]
tissue = tissue[select]
level1 = droplevels(level1[select])
level2 = level2[select]
table(tissue)
## tissue
## ca1hippocampus sscortex
## 1267 1006
table(tissue, group)
## group
## tissue 1 2 3 4 7 8
## ca1hippocampus 126 20 919 121 64 17
## sscortex 164 0 0 699 134 9
table(tissue, sex)
## sex
## tissue -1 0 1
## ca1hippocampus 710 46 511
## sscortex 444 0 562
table(level1)
## level1
## astrocytes_ependymal interneurons oligodendrocytes
## 224 290 820
## pyramidal CA1
## 939
table(level1, level2)
## level2
## level1 (none) Astro1 Astro2 CA1Pyr1 CA1Pyr2 CA1PyrInt
## astrocytes_ependymal 65 68 61 0 0 0
## interneurons 0 0 0 0 0 0
## oligodendrocytes 0 0 0 0 0 0
## pyramidal CA1 0 0 0 380 447 49
## level2
## level1 CA2Pyr2 Choroid ClauPyr Epend Int1 Int10 Int11
## astrocytes_ependymal 0 10 0 20 0 0 0
## interneurons 0 0 0 0 12 21 10
## oligodendrocytes 0 0 0 0 0 0 0
## pyramidal CA1 41 0 0 0 0 0 0
## level2
## level1 Int12 Int13 Int14 Int15 Int16 Int2 Int3 Int4 Int5
## astrocytes_ependymal 0 0 0 0 0 0 0 0 0
## interneurons 21 15 22 18 20 24 10 15 20
## oligodendrocytes 0 0 0 0 0 0 0 0 0
## pyramidal CA1 0 0 0 0 0 0 0 0 0
## level2
## level1 Int6 Int7 Int8 Int9 Mgl1 Mgl2 Oligo1 Oligo2 Oligo3
## astrocytes_ependymal 0 0 0 0 0 0 0 0 0
## interneurons 22 23 26 11 0 0 0 0 0
## oligodendrocytes 0 0 0 0 0 0 45 98 87
## pyramidal CA1 0 0 0 0 0 0 0 0 0
## level2
## level1 Oligo4 Oligo5 Oligo6 Peric Pvm1 Pvm2 S1PyrDL
## astrocytes_ependymal 0 0 0 0 0 0 0
## interneurons 0 0 0 0 0 0 0
## oligodendrocytes 106 125 359 0 0 0 0
## pyramidal CA1 0 0 0 0 0 0 0
## level2
## level1 S1PyrL23 S1PyrL4 S1PyrL5 S1PyrL5a S1PyrL6 S1PyrL6b
## astrocytes_ependymal 0 0 0 0 0 0
## interneurons 0 0 0 0 0 0
## oligodendrocytes 0 0 0 0 0 0
## pyramidal CA1 0 0 0 0 0 0
## level2
## level1 SubPyr Vend1 Vend2 Vsmc
## astrocytes_ependymal 0 0 0 0
## interneurons 0 0 0 0
## oligodendrocytes 0 0 0 0
## pyramidal CA1 22 0 0 0
vars <- rowVars(log1p(counts))
names(vars) <- rownames(counts)
vars <- sort(vars, decreasing = TRUE)
core <- counts[names(vars)[1:1000],]
detection_rate <- colSums(core>0)
coverage <- colSums(core)
col1 <- brewer.pal(9, "Set1")
col2 <- brewer.pal(8, "Set2")
colTissue <- col1[tissue]
colMerged <- col2[level1]
To speed up the computations, we will focus on the top 1,000 most variable genes.
par(mfrow = c(1,2))
pal =colorRampPalette(c("black","black", "red","yellow"), space="rgb")
mean = apply(log1p(core), 1, function(x) mean(x[x!=0]))
plot(mean, rowMeans(core == 0), xlim = c(0,4), ylim = c(0,1),
xlab = "Mean Expression", ylab = "Dropout Rate",
main = 'no filtering')
smoothScatter(mean, rowMeans(core == 0), xlim = c(0,4),
nrpoints=Inf, pch = "", cex = .7, ylim = c(0,1),
xlab = "Mean Expression", ylab = "Dropout Rate",
colramp = pal, main = '1,000 most variable genes')
Pink and glia cells have been removed. Fit data with K = 3, V intercepts in both Mu and Pi, commondispersion = FALSE, with X accounting for batch, qc.
print(system.time(zinb <- zinbFit(core, ncores = 3, K = 3,
commondispersion = FALSE)))
## user system elapsed
## 3473.381 1547.494 1999.622
If we simulate W from real data, it will look like that.
pairs(zinb@W, col=colMerged, pch=19, main="W\ncolor = level1")
pairs(zinb@W, col=colTissue, pch=19, main="W\ncolor = tissue")
gamma_mu = zinb@gamma_mu[1,]
gamma_pi = zinb@gamma_pi[1,]
df <- data.frame(gamma_mu = gamma_mu,
gamma_pi = gamma_pi,
detection_rate = detection_rate,
coverage = coverage)
pairs(df, pch=19, col=colMerged)
print(cor(df, method="spearman"))
## gamma_mu gamma_pi detection_rate coverage
## gamma_mu 1.0000000 -0.8374696 0.9240038 0.9818697
## gamma_pi -0.8374696 1.0000000 -0.9695456 -0.8902028
## detection_rate 0.9240038 -0.9695456 1.0000000 0.9508003
## coverage 0.9818697 -0.8902028 0.9508003 1.0000000
gamma = data.frame(gamma_mu = gamma_mu, gamma_pi = gamma_pi,
tissue = colTissue, level = colMerged)
gamma = melt(gamma)
ggplot(gamma, aes(x = value)) +
geom_histogram(aes(y = ..density..), bins = 20, col = 'gray') +
geom_density(col= 'blue', size = .5) +
facet_grid(~ variable) + xlab('gamma')
ggplot(gamma, aes(x = variable, y = value)) + xlab('') + theme_bw() +
geom_boxplot() + coord_flip() + facet_grid(~ variable, scales = 'free') +
theme_bw() + ylab('gamma') + scale_x_discrete(breaks = c('', ''), drop=FALSE)
ggplot(gamma, aes(value, fill = level)) + geom_density(alpha = 0.2) +
facet_grid(~ variable) + xlab('gamma') + theme_bw()
ggplot(gamma, aes(value, fill = tissue)) + geom_density(alpha = 0.2) +
facet_grid(~ variable) + xlab('gamma') + theme_bw()
beta_mu = zinb@beta_mu[1,]
beta_pi = zinb@beta_pi[1,]
df <- data.frame(beta_mu_intercept = beta_mu,
beta_pi_intercept = beta_pi)
pairs(df, pch=19)
print(cor(df, method="spearman"))
## beta_mu_intercept beta_pi_intercept
## beta_mu_intercept 1.0000000 -0.6913616
## beta_pi_intercept -0.6913616 1.0000000
beta = data.frame(beta_mu_intercept = beta_mu, beta_pi_intercept = beta_pi)
beta = melt(beta)
ggplot(beta, aes(x = value)) +
geom_histogram(aes(y = ..density..), bins = 100, col = 'gray') +
geom_density(col= 'blue', size = .5) +
facet_grid(~ variable, scales = 'free') + xlab('beta')
ggplot(beta, aes(x = variable, y = value)) + theme_bw() + xlab('') +
geom_boxplot() + facet_grid(~ variable, scales = 'free') + coord_flip() +
theme_bw() + ylab('beta') + scale_x_discrete(breaks = c('', ''), drop=FALSE)
# remove outliers
max = max(quantile(beta_pi, 0.1), quantile(beta_mu, 0.1))
min = min(quantile(beta_pi, 0.1), quantile(beta_mu, 0.1))
ggplot(beta, aes(x = variable, y = value)) + theme_bw() + xlab('') +
geom_boxplot(outlier.shape = NA) +
facet_grid(~ variable, scales = 'free') + coord_flip() +
theme_bw() + ylab('beta removing outliers') +
scale_x_discrete(breaks = c('', ''), drop=FALSE) +
scale_y_continuous(limits = c(min, max))
## Warning: Removed 1478 rows containing non-finite values (stat_boxplot).
pairs(t(zinb@alpha_mu), main = 'alpha_mu')
pairs(t(zinb@alpha_pi), main = 'alpha_pi')
df <- data.frame(alpha_mu_1 = zinb@alpha_mu[1, ],
alpha_mu_2 = zinb@alpha_mu[2, ],
alpha_mu_3 = zinb@alpha_mu[3, ],
alpha_pi_1 = zinb@alpha_pi[1, ],
alpha_pi_2 = zinb@alpha_pi[2, ],
alpha_pi_3 = zinb@alpha_pi[3, ])
pairs(df, pch=19)
print(cor(df, method="spearman"))
## alpha_mu_1 alpha_mu_2 alpha_mu_3 alpha_pi_1 alpha_pi_2
## alpha_mu_1 1.00000000 -0.02687891 -0.02637747 -0.63880851 0.01041533
## alpha_mu_2 -0.02687891 1.00000000 0.05989897 0.11732876 -0.38999690
## alpha_mu_3 -0.02637747 0.05989897 1.00000000 0.20936722 -0.22065224
## alpha_pi_1 -0.63880851 0.11732876 0.20936722 1.00000000 -0.02304856
## alpha_pi_2 0.01041533 -0.38999690 -0.22065224 -0.02304856 1.00000000
## alpha_pi_3 0.26979545 -0.18480626 -0.61527315 -0.30730654 0.03685982
## alpha_pi_3
## alpha_mu_1 0.26979545
## alpha_mu_2 -0.18480626
## alpha_mu_3 -0.61527315
## alpha_pi_1 -0.30730654
## alpha_pi_2 0.03685982
## alpha_pi_3 1.00000000
Something strange happened for the dispersion.
par(mfrow=c(1,1))
set = newSeqExpressionSet(core)
fq = betweenLaneNormalization(set, which = "full", offset = T)
disp = estimateDisp(counts(fq), offset = -offst(fq))
## Design matrix not provided. Switch to the classic mode.
plot(disp$tagwise.dispersion, 1/exp(zinb@zeta), ylab = 'zinb dispersion',
xlab = 'edgeR tagwise dispersion', main = 'Dispersion')
print(cor(disp$tagwise.dispersion, 1/exp(zinb@zeta), method="spearman"))
## [1] 0.05474489
par(mfrow = c(1, 2))
plot(density(1/exp(zinb@zeta)), main = 'zinb dispersion', xlim = c(0,10))
plot(density(disp$tagwise.dispersion), main = 'edgeR dispersion', xlim = c(0,10))
par(mfrow = c(1, 2))
mean = apply(log1p(core), 1, function(x) mean(x[x!=0]))
plot(mean, 1/exp(zinb@zeta), main = 'zinb', xlab = 'mean expression', ylab = 'dispersion', ylim = c(0, 9))
plot(mean, disp$tagwise.dispersion, main = 'edgeR',xlab = 'mean expression', ylab = 'dispersion', ylim = c(0, 9))
par(mfrow=c(1, 2))
detection_rate <- colMeans(core>0)
coverage <- colSums(core)
plot(rowMeans(getPi(zinb)), detection_rate, xlab="Average estimated Pi", ylab="Detection Rate for each cell", pch=19, col=colMerged, ylim = c(0, 1))
plot(rowMeans(log1p(getMu(zinb))), coverage, xlab="Average estimated log Mu", ylab="Coverage", pch=19, col=colMerged)
par(mfrow=c(1, 3))
smoothScatter(mean, rowMeans(core == 0),xlim = c(0,4),
nrpoints=Inf, pch = "", cex = .7, ylim = c(0,1),
xlab = "Mean Expression", ylab = "Dropout Rate",
colramp = pal, main = 'True')
smoothScatter(colMeans(log1p(getMu(zinb))), colMeans(getPi(zinb)), nbin = 256,
nrpoints=Inf, pch="", cex=.7, xlim = c(0,4),
xlab = "Estimated Mean Expression", main = 'Estimated',
ylab = "Estimated Dropout Rate",ylim = c(0,1),
colramp = pal)
plot(colMeans(log1p(getMu(zinb))), colMeans(getPi(zinb)),
xlab = "Estimated Mean Expression", xlim = c(0,4),
ylab = "Estimated Dropout Rate", pch=19,
ylim = c(0, 1), main = 'Estimated')